1 Introduction

In this Project the goal is to …

  1. … develop a research question.
  2. … identify Datasets which might help answer these questions.
  3. … investigate the research question.
  4. … Discuss the results.

The Project is developed and executed by Mathias Koch and Bernd Zamberger during the course of “M.BDSC.B.22.WS23 Verarbeitung und Speicherung großer Datenmengen” at the FH Wiener Neustadt during the MS Studies in Bio Data Science. The course is led by D. Iurashev.

1.1 Detailed Task Description

  • Clean the data
  • Understand what is in the data —
    • are the data measurements or estimates?
    • How accurate are these measurements or estimates? Are there biases in the data (e.g. in the data gathering process)?
    • If you use estimates to make new estimates, how accurate are the new estimates?
  • Check for missing data points – decide what to do about them
  • Check for outliers – decide what to do about them
  • Check for inconsistencies – decide what to do about them
  • Calculate descriptive statistics
  • Transform the data (e.g. changing units of measurements)
  • Check if the necessary data is there to answer the questions. If not, then you could: – Combine columns in some way to generate the necessary data – Find the necessary data in another dataset – Change the questions asked (in this case you have the freedom to do this, but this may not be the case if someone else is asking the questions)
  • Visualise the data
  • Calculate correlations
  • Check predictions

2 Research Question

What are the trends in R&D-Expenses in europe? * Can we see differences between countries? * Are there trends which we can identify?

3 Setup

# to get the version of R used in the notebook
paste("The R Version used in this notebook is", getRversion())
## [1] "The R Version used in this notebook is 4.3.2"

specify the CRAN repository where the packages have been downloaded from.

# Define the CRAN repository for this session
r_rep = getOption("repos")
r_rep["CRAN"] = "http://cran.us.r-project.org"
options(repos = r_rep)
# Loading libraries

# First should be installed
# install.packages("eurostat")
# install.packages("rvest")
# install.packages("knitr")
# install.packages("rgdal")
# install.packages("countrycode")
# install.packages("dplyr")
# install.packages("reshape2")
# install.packages("ggplot2")
# install.packages("TraMineR")
# install.packages("cluster")
# install.packages("factoextra")
# install.packages("RColorBrewer")
# install.packages("leaflet")
# install.packages("plotly")
# install.packages("htmlwidgets")

# And then should be loaded
library(tidyr)
library(readr)
library(tidyverse)
library(xts)
library(eurostat)
library(rvest)
library(knitr)
library(rgdal)
library(countrycode)
library(dplyr)
library(reshape2)
library(ggplot2)
library(TraMineR)
library(cluster)
library(factoextra)
library(RColorBrewer)
library(leaflet)
library(plotly)
# library(htmlwidgets)

create a list of used packages

# Create a list with all the available packages in my R environment
pkg_used <- available.packages()

# For every package print the version of the package, the version of R that depends on and the packages that imports
paste("eurostat Version is:", pkg_used["eurostat", "Version"])
## [1] "eurostat Version is: 4.0.0"

paste("rvest Version is:", pkg_used["rvest", "Version"])
## [1] "rvest Version is: 1.0.3"

paste("knitr Version is:", pkg_used["knitr", "Version"])
## [1] "knitr Version is: 1.45"

# paste("rgdal Version is:", pkg_used["rgdal", "Version"])

paste("countrycode Version is:", pkg_used["countrycode", "Version"])
## [1] "countrycode Version is: 1.5.0"

paste("dplyr Version is:", pkg_used["dplyr", "Version"])
## [1] "dplyr Version is: 1.1.4"

paste("reshape2 Version is:", pkg_used["reshape2", "Version"])
## [1] "reshape2 Version is: 1.4.4"

paste("ggplot2 Version is:", pkg_used["ggplot2", "Version"])
## [1] "ggplot2 Version is: 3.4.4"

paste("TraMineR Version is:", pkg_used["TraMineR", "Version"])
## [1] "TraMineR Version is: 2.2-9"

paste("cluster Version is:", pkg_used["cluster", "Version"])
## [1] "cluster Version is: 2.1.6"

paste("factoextra Version is:", pkg_used["factoextra", "Version"])
## [1] "factoextra Version is: 1.0.7"

paste("RColorBrewer Version is:", pkg_used["RColorBrewer", "Version"])
## [1] "RColorBrewer Version is: 1.1-3"

paste("leaflet Version is:", pkg_used["leaflet", "Version"])
## [1] "leaflet Version is: 2.2.1"

paste("plotly Version is:", pkg_used["plotly", "Version"])
## [1] "plotly Version is: 4.10.3"

3.1 set seed

set.seed(202375)

4 Data

4.1 Downloads

4.1.1 Download the dataset

# Specify the ID of the dataset required

id <- "rd_e_berdpfr2"

# Request of the dataset
rd_e_berd <- get_eurostat(id, time_format = "num")

# filter the dataset down to the selected countries
# filter based on the categories in unit
rd_e_berd <- rd_e_berd %>% filter(unit %in% c("MIO_EUR"))

# unique(rd_e_berd$unit)
# Countries of Intrest
country_code_1 <- c("BE","CZ","DK","DE","ES","FR","IT","NL","AT","PL","FI","SE", "EU27")

# EU 28
country_code_2 <- c("AT","BE","BG","HR","CY","CZ","DK","EE","FI","FR","DE","GR","HU","IE","IT","LV","LT","LU","MT","NL","PL","PT","RO","SK","SI","ES","SE","GB")

# EU 28 + (CH, NO)
country_code_3 <- c("AT","BE","BG","HR","CY","CZ","DK","EE","FI","FR","DE","GR","HU","IE","IT","LV","LT","LU","MT","NL","PL","PT","RO","SK","SI","ES","SE","GB", "CH", "NO")
# filter the countries of interest
rd_e_berd <- rd_e_berd %>%
 filter(`geo` %in% country_code_3)

# filter to only TOTAL values
rd_e_berd <- rd_e_berd %>% filter(nace_r2 == "TOTAL")

(rd_e_berd)

4.1.2 Download the spatial data

# Specifying data directories
home_dir <-"/proj/courses/2023_mg/cooler/Verarbeitung_Datenmengen/Project_B_Koch_Zamberger"
geo_dir <- file.path(home_dir, "geo")

# Specify the url that links to the zipped spatial datasets
url = "http://ec.europa.eu/eurostat/cache/GISCO/distribution/v2/nuts/download/ref-nuts-2016-60m.shp.zip"

destfile_geo<-file.path(geo_dir, "ref-nuts-2016-60m.shp.zip")

# Download the file
download.file(url, destfile_geo)

# Unzip the bulk file
unzip(destfile_geo, exdir = geo_dir)

# Unzip the specific shapefile needed
unzip(paste0(geo_dir, "/NUTS_RG_60M_2016_4326_LEVL_2.shp.zip"), exdir = geo_dir)

# Read in the shapefile
geodata <- readOGR(dsn = geo_dir, layer = "NUTS_RG_60M_2016_4326_LEVL_2")
## OGR data source with driver: ESRI Shapefile 
## Source: "/proj/courses/2023_mg/cooler/Verarbeitung_Datenmengen/Project_B_Koch_Zamberger/geo", layer: "NUTS_RG_60M_2016_4326_LEVL_2"
## with 332 features
## It has 9 fields
geodata_rd<-geodata

4.2 country names

# Create a new column for country names
geodata_rd@data$cntr_name <- countrycode(geodata_rd@data$CNTR_CODE, "iso2c", "country.name")

# Because European commission uses EL for Greece (in ISO 3166-1 alpha-2 codes is GR) and UK for United Kingdom (in ISO 3166-1 alpha-2 codes is GB) I should replace these two countries manually
geodata_rd@data$cntr_name <- ifelse(geodata_rd@data$CNTR_CODE=="EL","Greece", ifelse(geodata_rd@data$CNTR_CODE=="UK", "United Kingdom", geodata_rd@data$cntr_name))

4.3 Count NA’s

Creating a count_na function to count NA-values

count_na_func <- function(x) sum(is.na(x)) 

# count the missing values and save the value in a new column
rd_e_berd$count_na <- apply(rd_e_berd, 1, count_na_func)

# How many NA values do we have
sum(rd_e_berd$count_na)/length(rd_e_berd$values)
## [1] 0
summary(rd_e_berd$values)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##    12.16   242.67  1570.51  6217.01  6781.37 81808.90

4.3.1 Cleanup of the dataset

# Create a new column to store the number of characters of the geography
rd_e_berd$n_char <- nchar(as.character(rd_e_berd$geo))

# Subset only the Countries - their geography code contains 2 characters  

rd_e_berd_NUTS2 <- rd_e_berd %>% 
  filter(n_char == 2)

The key stages followed in this notebook are described below with links to the particular sub-sections which provide some further technical clarifications:

  1. [Data pre-processing], by classifying NUTS 2 regions into quintiles based on their total R&D spending each year from 2005 to 2022 (regions with the lowest R&D spending belong to the 1st quintile and regions with the highest R&D spending belong to the 5th quintile).
  2. Create a sequence object based on the quintile each region belongs to in every year (i.e. from 2005 to 2022).
  3. [Measuring sequence dissimilarity] based on substitution costs which is the probability of transitioning from one quintile to another (i.e. higher transition rate from 5th quintile to 1st quintile rather than from 2nd quintile to 3rd quintile).
  4. Using the substitution costs calculated in the previous stage, a dissimilarity matrix was built including every pair of sequences. In this notebook the Optimal Matching (OM) algorithm was used. The algorithm substitutes the elements of each sequence based on their substitution costs which in turn is the OM distance between each pair of sequences.
  5. In the last stage R&D spending trajectories were made using the resulting dissimilarity matrix from stage 4. Partitioning Around Medoids (PAM) clustering algorithm is used for the classification of sequences

5 Data Analysis

# I change the years from numbers to characters so to be recognised as categorical rather than continuous variable
head(rd_e_berd)
rd_e_berd$TIME_PERIOD <- as.character(rd_e_berd$TIME_PERIOD)
# Create a plot showing the rd expenses over the timeperiod

rd_e_berd %>% 
  group_by(TIME_PERIOD) %>% 
  dplyr::filter(nace_r2=="TOTAL") %>%
  summarise_all(mean, na.rm = TRUE) %>% 
  ggplot() +
  geom_bar(aes(x = TIME_PERIOD, y = values), stat = "identity", fill = "coral2") +
  labs(title = "BERD for timeperiod",
     x = "Year",
     y = "Value",
     caption = "Data downloaded from Eurostat\ncalculations made by the author") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Calculate quintiles by year 
# It is good to specify  the filter function to be used from dplyr function to avoid error messages



quant_data_rd <- NULL
for (var in unique(rd_e_berd_NUTS2$TIME_PERIOD)) {
    rd_e_berd_NUTS2_temp <- rd_e_berd_NUTS2 %>% 
      dplyr::filter(TIME_PERIOD == var) %>% 
      mutate(quintiles = ntile(values, 5) )
    quant_data_rd <- rbind(quant_data_rd,rd_e_berd_NUTS2_temp)
}

quant_data_rd # every entry is placed into a category depended on their value and the quintiles calculated
# drop the percentage values
quant_data_rd <- subset(quant_data_rd, select = -values)

# Re-format the data from long to wide format
# This means that every row will represent a region and every column represents a year

quant_data_wide_rd <-
  dcast(quant_data_rd,
  nace_r2 + unit + geo + n_char ~ TIME_PERIOD,
  value.var = 'quintiles')
    # You can use the dcast function from the data.table package in R to reshape a data frame from a long format to a wide format.

# We remove rows that do not have values in at least one year so we have consistency between sequences
quant_data_wide_rd <- na.omit(quant_data_wide_rd)

# Have a look at the dataset
str(quant_data_wide_rd)
## 'data.frame':    26 obs. of  22 variables:
##  $ nace_r2: chr  "TOTAL" "TOTAL" "TOTAL" "TOTAL" ...
##  $ unit   : chr  "MIO_EUR" "MIO_EUR" "MIO_EUR" "MIO_EUR" ...
##  $ geo    : chr  "AT" "BE" "BG" "CY" ...
##  $ n_char : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ 2005   : int  4 4 1 1 3 5 1 5 4 5 ...
##  $ 2006   : int  4 4 1 1 3 5 1 5 4 5 ...
##  $ 2007   : int  4 4 1 1 3 5 1 5 4 5 ...
##  $ 2008   : int  4 3 1 1 3 5 1 5 4 5 ...
##  $ 2009   : int  4 4 1 1 3 5 1 5 4 5 ...
##  $ 2010   : int  4 4 1 1 3 5 1 5 4 5 ...
##  $ 2011   : int  4 4 1 1 3 5 2 5 4 5 ...
##  $ 2012   : int  4 4 1 1 3 5 2 4 3 5 ...
##  $ 2013   : int  4 4 1 1 3 5 1 4 4 5 ...
##  $ 2014   : int  4 4 2 1 3 5 1 4 4 5 ...
##  $ 2015   : int  4 4 2 1 3 5 1 4 3 5 ...
##  $ 2016   : int  4 4 2 1 3 5 1 4 4 5 ...
##  $ 2017   : int  4 4 2 1 3 5 1 4 3 5 ...
##  $ 2018   : int  4 4 2 1 3 5 1 4 4 5 ...
##  $ 2019   : int  4 4 2 1 3 5 1 4 3 5 ...
##  $ 2020   : int  4 4 2 1 3 5 1 4 4 5 ...
##  $ 2021   : int  4 4 2 1 3 5 1 4 3 5 ...
##  $ 2022   : int  4 4 2 1 3 5 1 4 4 5 ...
##  - attr(*, "na.action")= 'omit' Named int [1:2] 4 8
##   ..- attr(*, "names")= chr [1:2] "4" "8"

5.1 Sequence object

# Create the sequence object using only the quintiles that every region belongs
seq_obj_rd <- seqdef(quant_data_wide_rd[,5:22])
# Calculate substitution costs
subs_costs_rd <- seqsubm(seq_obj_rd, method = "TRATE")

# Print the substitution costs
kable(subs_costs_rd)
1 2 3 4 5
0.000000 1.921569 2.000000 2.000000 2.000000
1.921569 0.000000 1.823853 2.000000 2.000000
2.000000 1.823853 0.000000 1.833525 2.000000
2.000000 2.000000 1.833525 0.000000 1.862873
2.000000 2.000000 2.000000 1.862873 0.000000

5.2 Dissimilarity matrix

# Calculate the distance matrix
seq.OM_rd <- seqdist(seq_obj_rd, method = "OM", sm = subs_costs_rd)

5.3 Classification of sequences

# Assess different clustering solutions to specify the optimal number of clusters

fviz_nbclust(seq.OM_rd, cluster::pam, method = "wss")

# Assess different clustering solutions to specify the optimal number of clusters

fviz_nbclust(seq.OM_rd, cluster::pam, method = "silhouette")

# Run clustering algorithm with k
pam.res_rd <- pam(seq.OM_rd, 6)

5.4 Visualizing the results

# Assign the cluster group into the tabular dataset
quant_data_wide_rd$cluster <- pam.res_rd$clustering

# Then rename clusters 
quant_data_wide_rd$cluster <- factor(quant_data_wide_rd$cluster, levels=c(1, 2, 3, 4, 5, 6),
                                  labels=c("Cluster1",
                                           "Cluster2",
                                           "Cluster3",
                                           "Cluster4",
                                           "Cluster5",
                                           "Cluster6"))
unique(quant_data_wide_rd$cluster)
## [1] Cluster1 Cluster2 Cluster3 Cluster4 Cluster5 Cluster6
## Levels: Cluster1 Cluster2 Cluster3 Cluster4 Cluster5 Cluster6
# Plot of individual sequences split by sequence group
seqIplot(seq_obj_rd, group = quant_data_wide_rd$cluster, ylab = "Number of sequences")

# Distribution plot by sequence group
# seqdplot(seq_obj_rd, group = quant_data_wide_rd$cluster, border=NA, ylab = "Distribution of sequences")